library(ggplot2)
library(dplyr)
library(statsr)
library(psych)
library(knitr)

load("~/Desktop/Coursera/Statistics with R/Linear regression and modeling/Final Project/movies.Rdata")

This dataset is comprised of 651 randomly sampled movies produced and released before 2016.
We have no details about how the movies were selected, so we cannot be sure that the sampling procedure was completely unbiased.
Despite this, we will assume that our findings will be generalizable to the population of movies released prior to 2016 in the United States.
This study is an observational one; there are no experimental conditions, thus it is impossible to infer causality. We will draw conclusions only about the association between variables.
You can find the codebook of the dataset here
These are the five new variables
# Preview the five new indicator variables (first 10 rows).
# Inside mutate() the data frame's columns are referenced directly;
# the original `movies$col` form bypasses the pipeline and is redundant.
knitr::kable(movies %>%
  mutate(feature_film  = ifelse(title_type == "Feature Film", "yes", "no"),
         drama         = ifelse(genre == "Drama", "yes", "no"),
         mpaa_rating_R = ifelse(mpaa_rating == "R", "yes", "no"),
         # Oscar season = theatrical release in Oct-Dec (month > 9)
         oscar_season  = ifelse(thtr_rel_month > 9, "yes", "no"),
         # Summer season = theatrical release in May-Aug
         summer_season = ifelse(thtr_rel_month %in% c(5, 6, 7, 8), "yes", "no")) %>%
  select(feature_film, drama, mpaa_rating_R, oscar_season, summer_season) %>%
  head(10))

| feature_film | drama | mpaa_rating_R | oscar_season | summer_season |
|---|---|---|---|---|
| yes | yes | yes | no | no |
| yes | yes | no | no | no |
| yes | no | yes | no | yes |
| yes | yes | no | yes | no |
| yes | no | yes | no | no |
| no | no | no | no | no |
| yes | yes | no | no | no |
| yes | yes | yes | yes | no |
| no | no | no | no | no |
| yes | yes | no | no | no |
The aim is to perform exploratory data analysis (EDA) of the relationship between audience_score and the new variables constructed in the previous part
The following plots are boxplots with the variables in relationship with the audience_score. The plots show the median and the IQR of the audience score in each of the categories.
# Build the analysis data frame: the original movies data plus the five
# new indicator variables. Columns are referenced directly inside
# mutate(); the original `movies$col` form is redundant there.
df <- movies %>%
  mutate(feature_film  = ifelse(title_type == "Feature Film", "yes", "no"),
         drama         = ifelse(genre == "Drama", "yes", "no"),
         mpaa_rating_R = ifelse(mpaa_rating == "R", "yes", "no"),
         # Oscar season = theatrical release in Oct-Dec (month > 9)
         oscar_season  = ifelse(thtr_rel_month > 9, "yes", "no"),
         # Summer season = theatrical release in May-Aug
         summer_season = ifelse(thtr_rel_month %in% c(5, 6, 7, 8), "yes", "no"))

library(plotly)
# Boxplots of audience_score against each new indicator variable
# (median and IQR per category).
p1 <- plot_ly(df, x = ~feature_film,  y = ~audience_score, type = "box", name = "Feature film")
p2 <- plot_ly(df, x = ~drama,         y = ~audience_score, type = "box", name = "Drama film")
p3 <- plot_ly(df, x = ~mpaa_rating_R, y = ~audience_score, type = "box", name = "R rated film")
p <- subplot(p1, p2, p3)
# BUG FIX: this was assigned to `pp4`, but subplot() below references `p4`,
# which would fail with "object 'p4' not found".
p4 <- plot_ly(df, x = ~oscar_season,  y = ~audience_score, type = "box", name = "Oscar Season")
p5 <- plot_ly(df, x = ~summer_season, y = ~audience_score, type = "box", name = "Summer Season")
p1 <- subplot(p4, p5)
p1

We have to develop a Bayesian regression model to predict audience_score from the following explanatory variables. Note that some of these variables are in the original dataset provided, and others are new variables we constructed earlier: feature_film, drama, runtime, mpaa_rating_R, thtr_rel_year, oscar_season, summer_season, imdb_rating, imdb_num_votes, critics_score, best_pic_nom, best_pic_win, best_actor_win, best_actress_win, best_dir_win, top200_box.
We fit a Bayesian model with a uniform model prior, using Markov Chain Monte Carlo simulation to estimate the posterior probabilities.
library(BAS)

# Response plus the 16 candidate predictors for the Bayesian model.
predictors <- c("feature_film", "drama", "runtime", "mpaa_rating_R",
                "thtr_rel_year", "oscar_season", "summer_season",
                "imdb_rating", "imdb_num_votes", "critics_score",
                "best_pic_nom", "best_pic_win", "best_actor_win",
                "best_actress_win", "best_dir_win", "top200_box")
newdf <- df %>%
  select(all_of(c("audience_score", predictors)))

# Bayesian model averaging: Zellner-Siow null prior on the coefficients,
# uniform prior over models, posterior explored with 10,000 MCMC iterations.
model <- bas.lm(
  audience_score ~ .,
  data            = newdf,
  prior           = "ZS-null",
  modelprior      = uniform(),
  method          = "MCMC",
  MCMC.iterations = 10000
)
round(t(summary(model)), 3)

              Intercept feature_filmyes dramayes runtime mpaa_rating_Ryes
P(B != 0 | Y) 1 0.061 0.039 0.427 0.197
model 1 1 0.000 0.000 0.000 0.000
model 2 1 0.000 0.000 1.000 0.000
model 3 1 0.000 0.000 0.000 0.000
model 4 1 0.000 0.000 0.000 1.000
model 5 1 0.000 0.000 1.000 1.000
thtr_rel_year oscar_seasonyes summer_seasonyes imdb_rating
P(B != 0 | Y) 0.084 0.072 0.076 1
model 1 0.000 0.000 0.000 1
model 2 0.000 0.000 0.000 1
model 3 0.000 0.000 0.000 1
model 4 0.000 0.000 0.000 1
model 5 0.000 0.000 0.000 1
imdb_num_votes critics_score best_pic_nomyes best_pic_winyes
P(B != 0 | Y) 0.045 0.892 0.114 0.029
model 1 0.000 1.000 0.000 0.000
model 2 0.000 1.000 0.000 0.000
model 3 0.000 1.000 0.000 0.000
model 4 0.000 1.000 0.000 0.000
model 5 0.000 1.000 0.000 0.000
best_actor_winyes best_actress_winyes best_dir_winyes
P(B != 0 | Y) 0.171 0.147 0.065
model 1 0.000 0.000 0.000
model 2 0.000 0.000 0.000
model 3 1.000 0.000 0.000
model 4 0.000 0.000 0.000
model 5 0.000 0.000 0.000
top200_boxyes BF PostProbs R2 dim logmarg
P(B != 0 | Y) 0.047 NA NA NA NA NA
model 1 0.000 1.000 0.158 0.752 3 444
model 2 0.000 0.870 0.136 0.755 4 444
model 3 0.000 0.224 0.039 0.754 4 442
model 4 0.000 0.222 0.039 0.754 4 442
model 5 0.000 0.206 0.027 0.756 5 442
plot(model, which = 4)

According to our model, only the intercept, imdb_rating and critics_score have a posterior probability higher than .50, so we decide to include them in the final model; we also decide to include the runtime variable to improve predictions, even though its posterior probability is below .5 and the Bayes factor suggests that the best model is the one with only imdb_rating and critics_score.
The final model will be
\(Y_i = \beta_0 + \beta_1 \, imdb\_rating_i + \beta_2 \, critics\_score_i + \beta_3 \, runtime_i + \epsilon_i\)
# Final model: audience_score ~ imdb_rating + critics_score + runtime,
# with the same ZS-null / uniform-model-prior / MCMC settings as above.
final <- bas.lm(audience_score ~ imdb_rating + critics_score + runtime,
                data = newdf, prior = "ZS-null", modelprior = uniform(),
                method = "MCMC", MCMC.iterations = 10000)

# Convergence diagnostics, shown side by side.
# BUG FIX: `diagnostics(final)` and `par(mfrow = c(2,2))` were fused onto
# one line, which is a syntax error in R; they are now separate statements.
par(mfrow = c(1, 2))
diagnostics(final)

# 2x2 layout for the coefficient posterior plots that follow.
par(mfrow = c(2, 2))
plot(coef(final))

We make a prediction for the film “Dunkirk”, directed by Christopher Nolan.
The film has an imdb_rating = 8.2, critics_score = 92 and runtime = 120
predict(final, estimator = "BPM", newdata = data.frame(imdb_rating = 8.2, critics_score = 92, runtime = 120))

$fit
[1] 89.9
attr(,"model")
[1] 0 1 2
attr(,"best")
[1] 6
$Ybma
[,1]
[1,] 89.7
$Ypred
[,1]
[1,] 89.9
[2,] 89.6
[3,] 89.5
[4,] 89.9
[5,] 65.0
[6,] 80.2
[7,] 62.3
$postprobs
[1] 0.4681 0.4184 0.0658 0.0475 0.0001 0.0001 0.0000
$se.fit
NULL
$se.pred
NULL
$se.bma.fit
NULL
$se.bma.pred
NULL
$df
[1] 649
$best
[1] 6
$bestmodel
[1] 0 1 2
$prediction
[1] FALSE
$estimator
[1] "BPM"
attr(,"class")
[1] "pred.bas"
The value with the highest posterior probability is 89.9, which is much higher than the real audience score of 81%. However, we cannot be sure that the sampling procedure was completely unbiased, so the parameters of our model could be biased too. Finally, we must remember that this study is observational; thus, even if the variables in our model are most likely predictors of audience score, it is impossible to infer causality.